In [ ]:
import pandas as pd
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
import statsmodels.api as sm
import patsy
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
import numpy as np
from sklearn import tree

# Initialise plotly's offline notebook mode.
py.offline.init_notebook_mode()
# Set the cufflinks configuration to offline mode as well.
cf.set_config_file(offline=True)

Import data

In [ ]:
# Read the elastography table and give three columns shorter names.
# Note the trailing space in the raw 'Colour ' header.
column_renames = {'M/F': 'sex', 'Colour ': 'Color', 'Av Strain': 'AV_Strain'}
df = pd.read_csv('data/elastography.csv').rename(columns=column_renames)
In [ ]:
# Manually set the Classification of the record with index label 40 to 0.
# NOTE(review): the reason for this override is not recorded in the notebook;
# confirm it against the data source.
# `.loc` is used instead of the deprecated (and since removed) `.ix` indexer;
# on the default integer index created by read_csv both select by label.
df.loc[40, 'Classification'] = 0
In [ ]:
# One age histogram trace per Classification code, combined in a stacked layout.
# NOTE(review): the code -> name pairing comes from the trace names below;
# verify it against the study's data dictionary.
benign = go.Histogram(x=df.Age.loc[df.Classification == 0], opacity=0.75, name='benign')
malignant = go.Histogram(x=df.Age.loc[df.Classification == 1], opacity=0.75, name='malignant')
lipoma = go.Histogram(x=df.Age.loc[df.Classification == 2], opacity=0.75, name='lipoma')
atypical_lipoma = go.Histogram(x=df.Age.loc[df.Classification == 3], opacity=0.75, name='atypical lipoma')
data = [benign, malignant, lipoma, atypical_lipoma]
layout = go.Layout(barmode='stack')
# NOTE(review): `fig` is constructed here but not displayed in this cell.
fig = go.Figure(data=data, layout=layout)

# Histogram every column except the record identifier on a 6x2 subplot grid.
cols = list(df.columns)
cols.remove('Study ID')
# `shape` has to be passed as a keyword argument: writing `shape(6,2)` is a
# positional call expression after keyword arguments, which is a SyntaxError.
df[cols].iplot(kind='histogram', subplots=True, shape=(6, 2))
In [ ]:
def _one_hot(column, prefix):
    """One-hot encode ``df[column]``, naming each column '<prefix>_<level>'."""
    encoded = pd.get_dummies(df[column])
    return encoded.rename(columns=lambda level: prefix + '_' + str(level))


# One indicator frame per predictor; the names are reused by later cells.
age_dummies = _one_hot('Age', 'Age')
size_dummies = _one_hot('Size', 'Size')
depth_dummies = _one_hot('Depth', 'Depth')
contrast_dummies = _one_hot('Contrast', 'Contrast')
wall_dummies = _one_hot('Wall', 'Wall')
doppler_dummies = _one_hot('Doppler', 'Doppler')
elastography_dummies = _one_hot('Elastography', 'Elastography')
sex_dummies = _one_hot('sex', 'Sex')
color_dummies = _one_hot('Color', 'Color')
site_dummies = _one_hot('Site', 'Site')
strain_dummies = _one_hot('AV_Strain', 'Strain')

dummies_list = [
    age_dummies,
    size_dummies,
    depth_dummies,
    contrast_dummies,
    wall_dummies,
    doppler_dummies,
    elastography_dummies,
    sex_dummies,
    color_dummies,
    site_dummies,
    strain_dummies,
]

# Flat list of every indicator column name, in the order of dummies_list.
var_names = [col for frame in dummies_list for col in frame]

# Collapse the four Classification codes into a binary target: only code 3
# becomes the positive class (1); codes 0, 1 and 2 become 0.
# NOTE(review): the histogram cell names code 1 'malignant' and code 3
# 'atypical lipoma' -- confirm that code 3 is the intended positive class.
old_new_map = {0: 0, 1: 0, 2: 0, 3: 1}
df['Binary_Classification'] = df['Classification'].map(old_new_map)
In [ ]:
# Lookup table from indicator-column names ('<Prefix>_<code>') to the
# human-readable labels used when presenting results.
# NOTE(review): the meaning attached to each numeric code is exactly as entered
# here; verify against the study's data dictionary.
readable_var_names_mapper = {
    'Age_0':'Age < 20',
    'Age_1':'Age 21-30',
    'Age_2':'Age 31-40',
    'Age_3':'Age 41-50',
    'Age_4':'Age 51-60',
    'Age_5':'Age 61-70',
    'Age_6':'Age >71',
    'Size_0':'Size 0-5',
    'Size_1':'Size 6 - 10',
    'Size_2':'Size 11 - 20',
    'Size_3':'Size > 21',
    'Depth_0':'Depth = superficial',
    'Depth_1':'Depth = deep',
    'Contrast_0':'Contrast = hypo',
    'Contrast_1':'Contrast = iso',
    'Contrast_2':'Contrast = hyper',
    'Contrast_3':'Contrast = complex',
    'Wall_0':'Wall = regular',
    'Wall_1':'Wall = irregular',
    'Doppler_0':'Doppler = No',
    'Doppler_1':'Doppler = Yes',
    'Elastography_0':'Elastography = homo',
    'Elastography_1':'Elastography = hetero',
    'Sex_0':'Sex = male',
    'Sex_1':'Sex = female',
    'Color_0':'Color = green',
    'Color_1':'Color = green/yellow',
    'Color_2':'Color = green/red',
    'Color_3':'Color = green/yellow/blue',
    'Color_4':'Color = green/blue',
    'Color_5':'Color = blue',
    'Color_6':'Color = green/black',
    'Color_7':'Color = green/blue/black',
    'Color_8':'Color = blue/black',
    'Color_9':'Color = black',
    'Site_0':'Site = head/neck',
    'Site_1':'Site = upper torso',
    'Site_2':'Site = lower torso',
    'Site_3':'Site = arm',
    'Site_4':'Site = leg',
    'Strain_0':'Strain < 1',
    'Strain_1':'Strain 1-5',
    'Strain_2':'Strain 6-10',
    'Strain_3':'Strain > 11'
}

def prettify_var_names(var_list):
    """Translate indicator-column names into their human-readable labels.

    Each name in ``var_list`` is looked up in ``readable_var_names_mapper``.

    Args:
        var_list: iterable of column names such as ``'Age_0'``.

    Returns:
        A new list of labels, in the same order as ``var_list``.

    Raises:
        KeyError: if a name has no entry in ``readable_var_names_mapper``.
    """
    return [readable_var_names_mapper[name] for name in var_list]

# Human-readable labels for every indicator column, in the order of var_names.
readable_var_names = prettify_var_names(var_names)
In [ ]:
# Modelling frame: the binary target 'y' followed by every indicator column,
# joined on the shared row index in a fixed order.
df_rc = pd.DataFrame({'y': df['Binary_Classification']})

for indicator_frame in (
    age_dummies,
    size_dummies,
    depth_dummies,
    contrast_dummies,
    wall_dummies,
    doppler_dummies,
    elastography_dummies,
    sex_dummies,
    color_dummies,
    site_dummies,
    strain_dummies,
):
    df_rc = df_rc.join(indicator_frame)

Explore a classifier for the binary outcome, i.e. whether or not the lesion is malignant.

In [ ]:
# Separate the target column from the indicator (feature) columns.
y = df_rc['y']
x_cols = [col for col in df_rc.columns if col != 'y']
X = df_rc[x_cols]

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split -- and every metric computed from it -- reproducible across re-runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=42
)
In [ ]:
# Fit a random forest on the training split. The forest is seeded so that the
# fitted trees, the score and the feature importances are reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train.ravel())
In [ ]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test,y_test)
In [ ]:
# Feature importances scaled by 100 so they read as percentages.
features = clf.feature_importances_ * 100

# (rounded importance, label) pairs, largest first.
rounded_importances = [round(value, 4) for value in features]
var_dict = sorted(zip(rounded_importances, readable_var_names), reverse=True)

# Index order that sorts the importances ascending, plus matching bar positions.
sorted_idx = features.argsort()
pos = np.arange(len(sorted_idx)) + 0.5

# (importance, label) pairs for the feature columns, largest first.
ordered_importances = features[sorted_idx]
ordered_labels = np.asanyarray(prettify_var_names(x_cols))[sorted_idx]
feature_ranks = sorted(zip(ordered_importances, ordered_labels), reverse=True)
In [ ]:
# Tabulate the ranked importances, indexed by feature label, and draw them as
# a horizontal bar chart.
df_features = (
    pd.DataFrame(feature_ranks, columns=['contribution %', 'feature'])
    .set_index(['feature'])
)
df_features.iplot(kind='barh', legend=True)
In [ ]:
# Predicted class for each row of the held-out test split.
predictions = clf.predict(X_test)
In [ ]:
# Flatten the test-split target values for comparison with `predictions`.
y_vals = y_test.ravel()
In [ ]:
 
In [ ]:
# Cross-tabulate actual labels (rows, 'y_vals') against predicted labels
# (columns, 'predictions'). crosstab's first argument supplies the rows and the
# second the columns, so the argument order must agree with rownames/colnames;
# the metric formulas that index this table as confusion_matrix[pred][actual]
# depend on that orientation.
confusion_matrix = pd.crosstab(
    y_vals, predictions, rownames=['y_vals'], colnames=['predictions']
)
# Guarantee a full 2x2 table even when a class is absent from the test split or
# is never predicted, so lookups such as confusion_matrix[1][1] do not raise
# KeyError.
confusion_matrix = confusion_matrix.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
In [ ]:
# Pull out the four cells of the 2x2 table once. Indexing is
# confusion_matrix[column_label][row_label]; the table's columns are labelled
# 'predictions' and its rows 'y_vals'.
n00 = confusion_matrix[0][0]
n01 = confusion_matrix[0][1]
n10 = confusion_matrix[1][0]
n11 = confusion_matrix[1][1]

rf_metrics = dict(
    sensitivity=float(n11) / float(n01 + n11),
    specificity=float(n00) / float(n00 + n10),
    accuracy=float(n00 + n11) / float(n00 + n01 + n10 + n11),
)
In [ ]:
# Display the computed metrics dictionary.
rf_metrics
In [ ]:
# Write each tree of the fitted forest to its own Graphviz file:
# tree_0.dot, tree_1.dot, ...
for tree_idx, est in enumerate(clf.estimators_):
    dot_path = 'tree_{}.dot'.format(str(tree_idx))
    tree.export_graphviz(est, out_file=dot_path)
In [ ]: